rm(list = ls())

library(tidyverse)
library(stargazer)
library(lfe)
library(broom) #for tidy
library(lmtest) # For coeftest()
library(sandwich) # For sandwich()
library(estimatr)
library(texreg)
library(psych)
library(default)
library(sjmisc) #for row_sums
library(quantreg)
library(scales) #for pretty_breaks
options(tibble.print_min = 10)
source("functions.R")

# Load the person-year panel; the code below expects this file to provide
# the data frame `dta`.
load("./data_output/persons_years.Rdata")

# Restrict ages and years: keep rows with ika strictly between 35 and 40
# and vuosi strictly between 2000 and 2016.
full_sample <- dta %>%
  filter(ika > 35, ika < 40) %>%
  filter(vuosi > 2000, vuosi < 2016)

# Cohort structure: row counts per (vuosi, synvuosi, ika) cell.
full_sample %>%
  count(vuosi, synvuosi, ika) %>%
  print(n = 100)

# Most NA earnings are unemployed or retired --> inc_labor = 0
full_sample %>%
  filter(is.na(inc_labor)) %>%
  count(ptoim)

# Define earnings. Missing labor income is set to 0 first; the derived
# measures are then computed within each year (vuosi):
#   inc_prop  - labor income relative to the within-year mean
#   inc_log   - log labor income, NA where income is exactly 0
#   inc_rank  - within-year percent rank of labor income
#   extensive - TRUE only when inc_log > 9 and emp_11 is TRUE; rows where
#               that condition evaluates to NA are set to FALSE
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# sourced code could rebind.
full_sample <- full_sample %>%
  mutate(inc_labor = ifelse(is.na(inc_labor), 0, inc_labor)) %>%
  group_by(vuosi) %>%
  mutate(
    inc_prop = inc_labor / mean(inc_labor, na.rm = TRUE),
    inc_log = ifelse(inc_labor == 0, NA, log(inc_labor)),
    inc_rank = percent_rank(inc_labor),
    extensive = if_else(
      inc_log > 9 & emp_11 == TRUE,
      TRUE,
      FALSE,
      missing = FALSE
    )
  ) %>%
  ungroup()


save(full_sample, file = "data_output/persons_years_sample.Rdata")

# Person-level data set rebuilt from the person-year panel (original note:
# reconstructed from the panel so that educ_y is included).
# The panel is loaded again here, so this step starts from the `dta` stored
# on disk.
load("./data_output/persons_years.Rdata")

# Keep exactly one row per person (shnro). Rows are ordered by distance of
# ika from 39 and then by ika itself, so the retained row is the observation
# closest to age 39 and, on a tie, the younger one (38 rather than 40).
# No upper age limit is applied at this step.
persons <- dta %>%
  arrange(shnro, abs(ika - 39), ika) %>%
  group_by(shnro) %>%
  slice(1L) %>%
  ungroup()

save(persons, file = "data_output/persons_sample.Rdata")
